knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)

Get the data

nypd = read.csv("https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD")

Clean the data

#Remove the columns that I am not interested in for this project
nypd <- nypd %>% 
  select(-c(Latitude, Longitude, Lon_Lat, X_COORD_CD, Y_COORD_CD))

#Create some new columns that I will use for analysis
nypd <- nypd %>% 
  mutate(OCCUR_DATE = mdy(OCCUR_DATE),
         OCCUR_TIME = hms(OCCUR_TIME),
         OCCUR_HOUR = as.integer(OCCUR_TIME@hour),
         OCCUR_MONTH = as.integer(month(OCCUR_DATE)),
         OCCUR_YEAR = year(OCCUR_DATE),
         STATISTICAL_MURDER_FLAG = as.logical(STATISTICAL_MURDER_FLAG),
         SHOOTINGS = 1,
         MURDER = as.integer(STATISTICAL_MURDER_FLAG))

Handle Missing Data

#Handle missing data

nypd[nypd == ''] <- NA

nypd <- nypd %>%
  replace_na(list(PERP_AGE_GROUP = "UNKNOW", PERP_RACE = "UNKNOWN", PERP_SEX = "UNKNOWN",  VIC_AGE_GROUP = "UNKNOWN", VIC_SEX="UNKNOWN", VIC_RACE="UNKNOWN"))

Borough Analysis

Barplot for total shootings in each borough

#color plot
ggplot(nypd, aes(x=as.factor(BORO), fill=as.factor(BORO))) +
  geom_bar() +
  scale_fill_brewer(palette = "Set1") +
  theme(legend.position="none")

Stacked barplot, split by murders vs. non-murders

#Barplot by Boro, stacked by murder
ggplot(nypd, aes(BORO, fill = STATISTICAL_MURDER_FLAG)) +
  geom_bar(position = "stack")

#Barplot by Boro, stacked by murder percentage
ggplot(nypd, aes(BORO, fill = STATISTICAL_MURDER_FLAG)) +
  geom_bar(position = "fill")

Timeseries Analysis

#Create a new df for time series analysis
nypd_annual <- nypd %>%
  group_by(OCCUR_YEAR, SHOOTINGS)%>%
  summarize(SHOOTINGS = sum(SHOOTINGS),
            STATISTICAL_MURDER_FLAG = sum(STATISTICAL_MURDER_FLAG)) %>%
  select(OCCUR_YEAR, SHOOTINGS, STATISTICAL_MURDER_FLAG) %>%
  ungroup()
## `summarise()` has grouped output by 'OCCUR_YEAR'. You can override using the
## `.groups` argument.
nypd_annual_murder <- nypd %>%
  group_by(OCCUR_YEAR, SHOOTINGS)%>%
  summarize(SHOOTINGS = sum(SHOOTINGS),
            MURDER = sum(MURDER)) %>%
  select(OCCUR_YEAR, SHOOTINGS, MURDER) %>%
  ungroup()
## `summarise()` has grouped output by 'OCCUR_YEAR'. You can override using the
## `.groups` argument.
#Create a new df for boro time series analysis
nypd_annual_boro <- nypd %>%
  group_by(OCCUR_YEAR, SHOOTINGS, BORO)%>%
  summarize(SHOOTINGS = sum(SHOOTINGS),
            STATISTICAL_MURDER_FLAG = sum(STATISTICAL_MURDER_FLAG)) %>%
  select(OCCUR_YEAR, BORO, SHOOTINGS, STATISTICAL_MURDER_FLAG) %>%
  ungroup()
## `summarise()` has grouped output by 'OCCUR_YEAR', 'SHOOTINGS'. You can override
## using the `.groups` argument.
#Timeseries lineplot
ggplot(nypd_annual, aes(x=OCCUR_YEAR)) +
  geom_line(aes(y=SHOOTINGS)) +
  geom_point(aes(y=SHOOTINGS)) +
  geom_line(aes(y=STATISTICAL_MURDER_FLAG), color="red") +
  geom_point(aes(y=STATISTICAL_MURDER_FLAG), color="red") +
  labs(title = "NYPD Shootings (Black) and Murders (Red) by Year",
       x = "Year",
       y = "Shootings/Murders")

#Timeseries by boro
ggplot(nypd_annual_boro, aes(x=OCCUR_YEAR, y=SHOOTINGS, color=BORO)) +
  geom_line() +
  geom_point() +
  labs(title = "NYPD Shootings by Year by Boro",
       x = "Year",
       y = "Shootings")

Time of year and time of day

#histogram by hour

ggplot(nypd, aes(x=as.factor(OCCUR_HOUR))) +
  geom_bar(fill="blue") 

#histogram by month

ggplot(nypd, aes(x=as.factor(OCCUR_MONTH))) +
  geom_bar(fill="red") 

2D Histograms/Contour plots: Shootings by month and time of day

#2d histogram by month and time of day
fig1 <- plot_ly(x = nypd$OCCUR_HOUR, y = nypd$OCCUR_MONTH, type = "histogram2dcontour")

fig1 <- fig1 %>% 
        colorbar(title = "Shootings") %>%
        layout(title = 'Time of Day vs. Month', plot_bgcolor = "#e5ecf6", xaxis=list(title = "Time (Local)"), yaxis = list(title = "Month"))

fig1
#putting it all together
s <- subplot(
  plot_ly(x = nypd$OCCUR_HOUR, type = "histogram"),
  plotly_empty(),
  plot_ly(x = nypd$OCCUR_HOUR, y = nypd$OCCUR_MONTH, type = "histogram2dcontour"),
  plot_ly(y = nypd$OCCUR_MONTH, type = "histogram"),
  nrows = 2, heights = c(0.2, 0.8), widths = c(0.8, 0.2), margin = 0,
  shareX = TRUE, shareY = TRUE, titleX = FALSE, titleY = FALSE
)
## Warning: No trace type specified and no positional attributes specified
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
fig <- layout(s, showlegend = FALSE)
fig

Logistic Regression Model

I decided to create a basic logistic regression model to try to predict whether a shooting was a murder or not. Logistic regressions are a generalized linear regression model for classification. In this case, it is binary since it is either a murder (1) or not a murder (0).

Create a new dataframe that makes the categorical data factors and filters out the Unknown data.

nypd_factor <- nypd %>%
  filter(PERP_SEX == 'M' | PERP_SEX == 'F') %>%
  filter(PERP_RACE != "UNKNOWN")%>%
  filter(PERP_AGE_GROUP != "UNKNOWN")%>%
  filter(VIC_SEX == 'M' | VIC_SEX == 'F') %>%
  filter(VIC_RACE != "UNKNOWN")%>%
  filter(VIC_AGE_GROUP != "UNKNOWN")%>%
  mutate(PERP_AGE_GROUP = as.factor(PERP_AGE_GROUP))%>%
  mutate(PERP_SEX = as.factor(PERP_SEX))%>%
  mutate(PERP_RACE = as.factor(PERP_RACE))%>%
  mutate(VIC_AGE_GROUP = as.factor(VIC_AGE_GROUP))%>%
  mutate(VIC_SEX = as.factor(VIC_SEX))%>%
  mutate(VIC_RACE = as.factor(VIC_RACE))

Create the logistic regression model

log_reg <- glm(MURDER ~ OCCUR_HOUR + OCCUR_MONTH + PERP_SEX +PERP_AGE_GROUP + PERP_RACE + VIC_AGE_GROUP + VIC_SEX + VIC_RACE, data = nypd_factor, family = "binomial")

summary(log_reg) 
## 
## Call:
## glm(formula = MURDER ~ OCCUR_HOUR + OCCUR_MONTH + PERP_SEX + 
##     PERP_AGE_GROUP + PERP_RACE + VIC_AGE_GROUP + VIC_SEX + VIC_RACE, 
##     family = "binomial", data = nypd_factor)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -24.396289 295.041851  -0.083  0.93410    
## OCCUR_HOUR                         -0.001751   0.002470  -0.709  0.47838    
## OCCUR_MONTH                         0.007696   0.006330   1.216  0.22405    
## PERP_SEXM                          -0.140780   0.114224  -1.232  0.21777    
## PERP_AGE_GROUP1020                -11.151815 324.743707  -0.034  0.97261    
## PERP_AGE_GROUP18-24                 0.133598   0.073666   1.814  0.06975 .  
## PERP_AGE_GROUP224                 -11.267741 324.743710  -0.035  0.97232    
## PERP_AGE_GROUP25-44                 0.402200   0.075045   5.359 8.35e-08 ***
## PERP_AGE_GROUP45-64                 0.716878   0.112075   6.396 1.59e-10 ***
## PERP_AGE_GROUP65+                   0.855667   0.282559   3.028  0.00246 ** 
## PERP_AGE_GROUP940                 -11.321904 324.743710  -0.035  0.97219    
## PERP_RACEASIAN / PACIFIC ISLANDER  11.895818 229.569261   0.052  0.95867    
## PERP_RACEBLACK                     11.509026 229.569185   0.050  0.96002    
## PERP_RACEBLACK HISPANIC            11.410831 229.569196   0.050  0.96036    
## PERP_RACEWHITE                     12.085766 229.569229   0.053  0.95801    
## PERP_RACEWHITE HISPANIC            11.609942 229.569191   0.051  0.95967    
## VIC_AGE_GROUP1022                 -10.979533 324.743705  -0.034  0.97303    
## VIC_AGE_GROUP18-24                  0.222279   0.076738   2.897  0.00377 ** 
## VIC_AGE_GROUP25-44                  0.304456   0.076236   3.994 6.51e-05 ***
## VIC_AGE_GROUP45-64                  0.308698   0.100047   3.086  0.00203 ** 
## VIC_AGE_GROUP65+                    0.495994   0.218130   2.274  0.02298 *  
## VIC_SEXM                           -0.067592   0.062717  -1.078  0.28115    
## VIC_RACEASIAN / PACIFIC ISLANDER   11.493414 185.331308   0.062  0.95055    
## VIC_RACEBLACK                      11.349646 185.331248   0.061  0.95117    
## VIC_RACEBLACK HISPANIC             11.197579 185.331260   0.060  0.95182    
## VIC_RACEWHITE                      11.349346 185.331286   0.061  0.95117    
## VIC_RACEWHITE HISPANIC             11.458032 185.331255   0.062  0.95070    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15321  on 13939  degrees of freedom
## Residual deviance: 15111  on 13913  degrees of freedom
## AIC: 15165
## 
## Number of Fisher Scoring iterations: 11

Conclusion and Bias Identification

In this report, I did some analysis on NYPD shooting trends. There appeared to be a steady decline in both shootings and murders until 2020, when there was a sharp increase. Additionally, we examined the time of year and time of day when shootings are most likely to occur. The data show that the summertime and during the evening/nighttime hours are the most dangerous. Finally, I created a logistic regression model to predict whether a shooting was a murder or not. The only statistically significant predictors from the candidate predictors were related to the age of the perp and victim.

There are a few areas of potential bias in these data. There are likely shootings that go unreported to the NYPD, and hence would not be in this data. There may be some boroughs that report shootings at a higher rate than others.